{
"cells": [
{
"cell_type": "markdown",
"id": "45606137-025b-4875-ac1a-7d3ae24c4b89",
"metadata": {},
"source": [
"# Working with PDB Structures in Pandas\n",
"\n",
"Let's start by loading up a PDB structure from my favourite OPIG tool, the [STCRDab](https://opig.stats.ox.ac.uk/webapps/stcrdab-stcrpred/)! Here we will look at [PDB ID 6eqb](https://opig.stats.ox.ac.uk/webapps/stcrdab-stcrpred/StrViewer?pdb=6eqb). It is a crystal structure of a T cell receptor interacting with a peptide presented by an MHC class I molecule."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5403cb8a-d882-4633-b7ca-c5c340f37586",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a592f0bf500f4768a305cc00d83b4513",
"version_major": 2,
"version_minor": 0
},
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import nglview"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c1720ce4-8b63-41bb-afe9-776dc5d68cb5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fc94934525a046a99bdb53b3d0ab0869",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"NGLWidget()"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build an interactive NGL widget from the PDB file; the bare name on the last line displays it.\n",
"view = nglview.show_file('data/6eqb.pdb')\n",
"view"
]
},
{
"cell_type": "markdown",
"id": "d6c5e466-577f-467c-a0d6-666f7f31297c",
"metadata": {},
"source": [
"Now let's load this molecule into a pandas dataframe and do some analysis. The PDB data is loaded into columns, mirroring the way PDB files themselves are laid out in columns."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2c456cd0-1443-40f2-8d99-13f3063640aa",
"metadata": {},
"outputs": [],
"source": [
"from python_pdb.parsers import parse_pdb_to_pandas"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b6e4f141-4396-405f-8d76-6dc8c71bed9a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
record_type
\n",
"
atom_number
\n",
"
atom_name
\n",
"
alt_loc
\n",
"
residue_name
\n",
"
chain_id
\n",
"
residue_seq_id
\n",
"
residue_insert_code
\n",
"
pos_x
\n",
"
pos_y
\n",
"
pos_z
\n",
"
occupancy
\n",
"
b_factor
\n",
"
element
\n",
"
charge
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
ATOM
\n",
"
1
\n",
"
N
\n",
"
A
\n",
"
ALA
\n",
"
C
\n",
"
2
\n",
"
None
\n",
"
48.681
\n",
"
-11.013
\n",
"
29.600
\n",
"
0.5
\n",
"
30.86
\n",
"
N
\n",
"
None
\n",
"
\n",
"
\n",
"
1
\n",
"
ATOM
\n",
"
2
\n",
"
CA
\n",
"
A
\n",
"
ALA
\n",
"
C
\n",
"
2
\n",
"
None
\n",
"
49.343
\n",
"
-9.708
\n",
"
29.330
\n",
"
0.5
\n",
"
29.82
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
2
\n",
"
ATOM
\n",
"
3
\n",
"
C
\n",
"
A
\n",
"
ALA
\n",
"
C
\n",
"
2
\n",
"
None
\n",
"
49.310
\n",
"
-8.792
\n",
"
30.562
\n",
"
0.5
\n",
"
29.16
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3
\n",
"
ATOM
\n",
"
4
\n",
"
O
\n",
"
A
\n",
"
ALA
\n",
"
C
\n",
"
2
\n",
"
None
\n",
"
48.668
\n",
"
-9.107
\n",
"
31.537
\n",
"
0.5
\n",
"
29.75
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
4
\n",
"
ATOM
\n",
"
5
\n",
"
CB
\n",
"
A
\n",
"
ALA
\n",
"
C
\n",
"
2
\n",
"
None
\n",
"
48.679
\n",
"
-9.044
\n",
"
28.146
\n",
"
0.5
\n",
"
29.57
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
6644
\n",
"
HETATM
\n",
"
6645
\n",
"
O
\n",
"
None
\n",
"
HOH
\n",
"
B
\n",
"
128
\n",
"
None
\n",
"
63.467
\n",
"
-9.052
\n",
"
4.232
\n",
"
1.0
\n",
"
46.86
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
6645
\n",
"
HETATM
\n",
"
6646
\n",
"
O
\n",
"
None
\n",
"
HOH
\n",
"
B
\n",
"
129
\n",
"
None
\n",
"
64.998
\n",
"
-3.887
\n",
"
10.735
\n",
"
1.0
\n",
"
48.05
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
6646
\n",
"
HETATM
\n",
"
6647
\n",
"
O
\n",
"
None
\n",
"
HOH
\n",
"
B
\n",
"
130
\n",
"
None
\n",
"
69.089
\n",
"
-37.773
\n",
"
1.079
\n",
"
1.0
\n",
"
63.55
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
6647
\n",
"
HETATM
\n",
"
6648
\n",
"
O
\n",
"
None
\n",
"
HOH
\n",
"
B
\n",
"
131
\n",
"
None
\n",
"
70.137
\n",
"
-40.643
\n",
"
1.067
\n",
"
1.0
\n",
"
59.75
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
6648
\n",
"
HETATM
\n",
"
6649
\n",
"
O
\n",
"
None
\n",
"
HOH
\n",
"
B
\n",
"
132
\n",
"
None
\n",
"
66.479
\n",
"
4.782
\n",
"
14.497
\n",
"
1.0
\n",
"
64.56
\n",
"
O
\n",
"
None
\n",
"
\n",
" \n",
"
\n",
"
6649 rows × 15 columns
\n",
"
"
],
"text/plain": [
" record_type atom_number atom_name alt_loc residue_name chain_id \\\n",
"0 ATOM 1 N A ALA C \n",
"1 ATOM 2 CA A ALA C \n",
"2 ATOM 3 C A ALA C \n",
"3 ATOM 4 O A ALA C \n",
"4 ATOM 5 CB A ALA C \n",
"... ... ... ... ... ... ... \n",
"6644 HETATM 6645 O None HOH B \n",
"6645 HETATM 6646 O None HOH B \n",
"6646 HETATM 6647 O None HOH B \n",
"6647 HETATM 6648 O None HOH B \n",
"6648 HETATM 6649 O None HOH B \n",
"\n",
" residue_seq_id residue_insert_code pos_x pos_y pos_z occupancy \\\n",
"0 2 None 48.681 -11.013 29.600 0.5 \n",
"1 2 None 49.343 -9.708 29.330 0.5 \n",
"2 2 None 49.310 -8.792 30.562 0.5 \n",
"3 2 None 48.668 -9.107 31.537 0.5 \n",
"4 2 None 48.679 -9.044 28.146 0.5 \n",
"... ... ... ... ... ... ... \n",
"6644 128 None 63.467 -9.052 4.232 1.0 \n",
"6645 129 None 64.998 -3.887 10.735 1.0 \n",
"6646 130 None 69.089 -37.773 1.079 1.0 \n",
"6647 131 None 70.137 -40.643 1.067 1.0 \n",
"6648 132 None 66.479 4.782 14.497 1.0 \n",
"\n",
" b_factor element charge \n",
"0 30.86 N None \n",
"1 29.82 C None \n",
"2 29.16 C None \n",
"3 29.75 O None \n",
"4 29.57 C None \n",
"... ... ... ... \n",
"6644 46.86 O None \n",
"6645 48.05 O None \n",
"6646 63.55 O None \n",
"6647 59.75 O None \n",
"6648 64.56 O None \n",
"\n",
"[6649 rows x 15 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the whole PDB file as text and hand it to the parser, which returns a DataFrame.\n",
"with open('data/6eqb.pdb') as fh:\n",
"    df = parse_pdb_to_pandas(fh.read())\n",
"\n",
"# Bare expression so the notebook renders the (truncated) frame.\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "5450287e-8d14-4057-9cbd-0c477c43d9a1",
"metadata": {},
"source": [
"To start things off, let's clean up this structure by highlighting one of the most powerful aspects of this approach: querying. As you can see from the data frame above, there are water molecules in the structure that we might not care about. Let's remove them..."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3b71804d-8bc1-41c8-ad29-9b07fa2b91c3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
record_type
\n",
"
atom_number
\n",
"
atom_name
\n",
"
alt_loc
\n",
"
residue_name
\n",
"
chain_id
\n",
"
residue_seq_id
\n",
"
residue_insert_code
\n",
"
pos_x
\n",
"
pos_y
\n",
"
pos_z
\n",
"
occupancy
\n",
"
b_factor
\n",
"
element
\n",
"
charge
\n",
"
\n",
" \n",
" \n",
"
\n",
"
6633
\n",
"
ATOM
\n",
"
6634
\n",
"
CB
\n",
"
None
\n",
"
MET
\n",
"
B
\n",
"
125
\n",
"
None
\n",
"
74.125
\n",
"
-32.337
\n",
"
9.293
\n",
"
1.0
\n",
"
121.02
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
6634
\n",
"
ATOM
\n",
"
6635
\n",
"
CG
\n",
"
None
\n",
"
MET
\n",
"
B
\n",
"
125
\n",
"
None
\n",
"
73.143
\n",
"
-31.632
\n",
"
10.263
\n",
"
1.0
\n",
"
116.33
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
6635
\n",
"
ATOM
\n",
"
6636
\n",
"
SD
\n",
"
None
\n",
"
MET
\n",
"
B
\n",
"
125
\n",
"
None
\n",
"
71.379
\n",
"
-32.103
\n",
"
10.373
\n",
"
1.0
\n",
"
113.91
\n",
"
S
\n",
"
None
\n",
"
\n",
"
\n",
"
6636
\n",
"
ATOM
\n",
"
6637
\n",
"
CE
\n",
"
None
\n",
"
MET
\n",
"
B
\n",
"
125
\n",
"
None
\n",
"
70.626
\n",
"
-31.028
\n",
"
9.134
\n",
"
1.0
\n",
"
109.06
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
6637
\n",
"
ATOM
\n",
"
6638
\n",
"
OXT
\n",
"
None
\n",
"
MET
\n",
"
B
\n",
"
125
\n",
"
None
\n",
"
74.839
\n",
"
-34.197
\n",
"
6.246
\n",
"
1.0
\n",
"
130.70
\n",
"
O
\n",
"
None
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" record_type atom_number atom_name alt_loc residue_name chain_id \\\n",
"6633 ATOM 6634 CB None MET B \n",
"6634 ATOM 6635 CG None MET B \n",
"6635 ATOM 6636 SD None MET B \n",
"6636 ATOM 6637 CE None MET B \n",
"6637 ATOM 6638 OXT None MET B \n",
"\n",
" residue_seq_id residue_insert_code pos_x pos_y pos_z occupancy \\\n",
"6633 125 None 74.125 -32.337 9.293 1.0 \n",
"6634 125 None 73.143 -31.632 10.263 1.0 \n",
"6635 125 None 71.379 -32.103 10.373 1.0 \n",
"6636 125 None 70.626 -31.028 9.134 1.0 \n",
"6637 125 None 74.839 -34.197 6.246 1.0 \n",
"\n",
" b_factor element charge \n",
"6633 121.02 C None \n",
"6634 116.33 C None \n",
"6635 113.91 S None \n",
"6636 109.06 C None \n",
"6637 130.70 O None "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only ATOM records: every row with another record_type (e.g. the HETATM waters) is dropped.\n",
"# Filtering on \"residue_name != 'HOH'\" instead would remove the waters only.\n",
"df_clean = df.query(\"record_type == 'ATOM'\")\n",
"df_clean.tail()"
]
},
{
"cell_type": "markdown",
"id": "db2eeadf-075a-4d28-89b6-976cd07cc573",
"metadata": {},
"source": [
"By using pandas' built-in querying function, we can easily get rid of the HETATMs in the file that we might not care about. We can also use this querying to select just the TCR or the pMHC and analyse each molecule separately. In this example, the TCR $\\alpha$ and $\\beta$ chains are labelled as chains D and E respectively. The MHC molecule is chain A (with chain B being the $\\beta_2$m) and chain C is the peptide."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "65640cbd-c8be-4169-9e3f-fb0020625ff1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"REMARK 5 IMGT RENUMBERED STRUCTURE 6EQB GENERATED BY STCRDAB\n",
"REMARK 5 TCR CHAINS ARE RENUMBERED IN THE VARIABLE REGIONS ONLY\n",
"REMARK 5 MHC CHAINS ARE RENUMBERED IN THE G DOMAINS OR FOR B2M-GLOBULIN\n",
"REMARK 5 NON-TCR AND NON-MHC CHAINS ARE LEFT WITH RESIDUE IDS AS IN PDB\n",
"REMARK 5 PAIRED_ABTCR BCHAIN=E ACHAIN=D MHCCHAINS=AB AGCHAIN=C AGTYPE=PEPTIDE\n"
]
}
],
"source": [
"# Print the first five lines of the file: the STCRDab REMARK header that names the TCR, MHC and antigen chains.\n",
"!head -n5 data/6eqb.pdb"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6c5fa0e7-f6bc-48e0-92bc-b81353f8589b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
record_type
\n",
"
atom_number
\n",
"
atom_name
\n",
"
alt_loc
\n",
"
residue_name
\n",
"
chain_id
\n",
"
residue_seq_id
\n",
"
residue_insert_code
\n",
"
pos_x
\n",
"
pos_y
\n",
"
pos_z
\n",
"
occupancy
\n",
"
b_factor
\n",
"
element
\n",
"
charge
\n",
"
\n",
" \n",
" \n",
"
\n",
"
57
\n",
"
ATOM
\n",
"
58
\n",
"
N
\n",
"
None
\n",
"
SER
\n",
"
E
\n",
"
1
\n",
"
None
\n",
"
44.786
\n",
"
19.936
\n",
"
31.694
\n",
"
1.0
\n",
"
91.71
\n",
"
N
\n",
"
None
\n",
"
\n",
"
\n",
"
58
\n",
"
ATOM
\n",
"
59
\n",
"
CA
\n",
"
None
\n",
"
SER
\n",
"
E
\n",
"
1
\n",
"
None
\n",
"
43.328
\n",
"
20.085
\n",
"
31.879
\n",
"
1.0
\n",
"
88.14
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
59
\n",
"
ATOM
\n",
"
60
\n",
"
C
\n",
"
None
\n",
"
SER
\n",
"
E
\n",
"
1
\n",
"
None
\n",
"
42.582
\n",
"
19.234
\n",
"
30.860
\n",
"
1.0
\n",
"
80.31
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
60
\n",
"
ATOM
\n",
"
61
\n",
"
O
\n",
"
None
\n",
"
SER
\n",
"
E
\n",
"
1
\n",
"
None
\n",
"
41.944
\n",
"
19.762
\n",
"
29.940
\n",
"
1.0
\n",
"
81.48
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
61
\n",
"
ATOM
\n",
"
62
\n",
"
CB
\n",
"
None
\n",
"
SER
\n",
"
E
\n",
"
1
\n",
"
None
\n",
"
42.912
\n",
"
21.571
\n",
"
31.778
\n",
"
1.0
\n",
"
93.53
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
3519
\n",
"
ATOM
\n",
"
3520
\n",
"
CD1
\n",
"
None
\n",
"
PHE
\n",
"
D
\n",
"
215
\n",
"
None
\n",
"
-5.784
\n",
"
33.229
\n",
"
67.928
\n",
"
1.0
\n",
"
154.28
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3520
\n",
"
ATOM
\n",
"
3521
\n",
"
CD2
\n",
"
None
\n",
"
PHE
\n",
"
D
\n",
"
215
\n",
"
None
\n",
"
-6.202
\n",
"
32.551
\n",
"
65.649
\n",
"
1.0
\n",
"
145.03
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3521
\n",
"
ATOM
\n",
"
3522
\n",
"
CE1
\n",
"
None
\n",
"
PHE
\n",
"
D
\n",
"
215
\n",
"
None
\n",
"
-6.684
\n",
"
34.292
\n",
"
67.786
\n",
"
1.0
\n",
"
160.08
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3522
\n",
"
ATOM
\n",
"
3523
\n",
"
CE2
\n",
"
None
\n",
"
PHE
\n",
"
D
\n",
"
215
\n",
"
None
\n",
"
-7.092
\n",
"
33.613
\n",
"
65.506
\n",
"
1.0
\n",
"
151.52
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3523
\n",
"
ATOM
\n",
"
3524
\n",
"
CZ
\n",
"
None
\n",
"
PHE
\n",
"
D
\n",
"
215
\n",
"
None
\n",
"
-7.336
\n",
"
34.483
\n",
"
66.576
\n",
"
1.0
\n",
"
159.15
\n",
"
C
\n",
"
None
\n",
"
\n",
" \n",
"
\n",
"
3461 rows × 15 columns
\n",
"
"
],
"text/plain": [
" record_type atom_number atom_name alt_loc residue_name chain_id \\\n",
"57 ATOM 58 N None SER E \n",
"58 ATOM 59 CA None SER E \n",
"59 ATOM 60 C None SER E \n",
"60 ATOM 61 O None SER E \n",
"61 ATOM 62 CB None SER E \n",
"... ... ... ... ... ... ... \n",
"3519 ATOM 3520 CD1 None PHE D \n",
"3520 ATOM 3521 CD2 None PHE D \n",
"3521 ATOM 3522 CE1 None PHE D \n",
"3522 ATOM 3523 CE2 None PHE D \n",
"3523 ATOM 3524 CZ None PHE D \n",
"\n",
" residue_seq_id residue_insert_code pos_x pos_y pos_z occupancy \\\n",
"57 1 None 44.786 19.936 31.694 1.0 \n",
"58 1 None 43.328 20.085 31.879 1.0 \n",
"59 1 None 42.582 19.234 30.860 1.0 \n",
"60 1 None 41.944 19.762 29.940 1.0 \n",
"61 1 None 42.912 21.571 31.778 1.0 \n",
"... ... ... ... ... ... ... \n",
"3519 215 None -5.784 33.229 67.928 1.0 \n",
"3520 215 None -6.202 32.551 65.649 1.0 \n",
"3521 215 None -6.684 34.292 67.786 1.0 \n",
"3522 215 None -7.092 33.613 65.506 1.0 \n",
"3523 215 None -7.336 34.483 66.576 1.0 \n",
"\n",
" b_factor element charge \n",
"57 91.71 N None \n",
"58 88.14 C None \n",
"59 80.31 C None \n",
"60 81.48 O None \n",
"61 93.53 C None \n",
"... ... ... ... \n",
"3519 154.28 C None \n",
"3520 145.03 C None \n",
"3521 160.08 C None \n",
"3522 151.52 C None \n",
"3523 159.15 C None \n",
"\n",
"[3461 rows x 15 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The STCRDab header gives ACHAIN=D and BCHAIN=E, so these two chain IDs make up the TCR.\n",
"tcr_df = df_clean.query(\"chain_id in ['D', 'E']\")\n",
"tcr_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "263ac0e5-f9e1-4901-9f65-da74b5b8d27a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
record_type
\n",
"
atom_number
\n",
"
atom_name
\n",
"
alt_loc
\n",
"
residue_name
\n",
"
chain_id
\n",
"
residue_seq_id
\n",
"
residue_insert_code
\n",
"
pos_x
\n",
"
pos_y
\n",
"
pos_z
\n",
"
occupancy
\n",
"
b_factor
\n",
"
element
\n",
"
charge
\n",
"
\n",
" \n",
" \n",
"
\n",
"
3541
\n",
"
ATOM
\n",
"
3542
\n",
"
N
\n",
"
None
\n",
"
GLY
\n",
"
A
\n",
"
1
\n",
"
None
\n",
"
63.937
\n",
"
-26.599
\n",
"
37.997
\n",
"
1.0
\n",
"
88.67
\n",
"
N
\n",
"
None
\n",
"
\n",
"
\n",
"
3542
\n",
"
ATOM
\n",
"
3543
\n",
"
CA
\n",
"
None
\n",
"
GLY
\n",
"
A
\n",
"
1
\n",
"
None
\n",
"
64.891
\n",
"
-25.601
\n",
"
38.583
\n",
"
1.0
\n",
"
88.18
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3543
\n",
"
ATOM
\n",
"
3544
\n",
"
C
\n",
"
None
\n",
"
GLY
\n",
"
A
\n",
"
1
\n",
"
None
\n",
"
64.208
\n",
"
-24.275
\n",
"
38.448
\n",
"
1.0
\n",
"
82.85
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
3544
\n",
"
ATOM
\n",
"
3545
\n",
"
O
\n",
"
None
\n",
"
GLY
\n",
"
A
\n",
"
1
\n",
"
None
\n",
"
63.079
\n",
"
-24.143
\n",
"
38.876
\n",
"
1.0
\n",
"
81.66
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
3545
\n",
"
ATOM
\n",
"
3546
\n",
"
N
\n",
"
None
\n",
"
SER
\n",
"
A
\n",
"
2
\n",
"
None
\n",
"
64.856
\n",
"
-23.315
\n",
"
37.808
\n",
"
1.0
\n",
"
80.55
\n",
"
N
\n",
"
None
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
5790
\n",
"
ATOM
\n",
"
5791
\n",
"
O
\n",
"
None
\n",
"
PRO
\n",
"
A
\n",
"
1186
\n",
"
None
\n",
"
77.326
\n",
"
-47.026
\n",
"
15.302
\n",
"
1.0
\n",
"
147.25
\n",
"
O
\n",
"
None
\n",
"
\n",
"
\n",
"
5791
\n",
"
ATOM
\n",
"
5792
\n",
"
CB
\n",
"
None
\n",
"
PRO
\n",
"
A
\n",
"
1186
\n",
"
None
\n",
"
79.797
\n",
"
-49.588
\n",
"
14.420
\n",
"
1.0
\n",
"
159.44
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
5792
\n",
"
ATOM
\n",
"
5793
\n",
"
CG
\n",
"
None
\n",
"
PRO
\n",
"
A
\n",
"
1186
\n",
"
None
\n",
"
81.295
\n",
"
-49.731
\n",
"
14.585
\n",
"
1.0
\n",
"
160.29
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
5793
\n",
"
ATOM
\n",
"
5794
\n",
"
CD
\n",
"
None
\n",
"
PRO
\n",
"
A
\n",
"
1186
\n",
"
None
\n",
"
81.803
\n",
"
-48.494
\n",
"
15.300
\n",
"
1.0
\n",
"
158.37
\n",
"
C
\n",
"
None
\n",
"
\n",
"
\n",
"
5794
\n",
"
ATOM
\n",
"
5795
\n",
"
OXT
\n",
"
None
\n",
"
PRO
\n",
"
A
\n",
"
1186
\n",
"
None
\n",
"
77.441
\n",
"
-48.931
\n",
"
16.082
\n",
"
1.0
\n",
"
151.45
\n",
"
O
\n",
"
None
\n",
"
\n",
" \n",
"
\n",
"
2254 rows × 15 columns
\n",
"
"
],
"text/plain": [
" record_type atom_number atom_name alt_loc residue_name chain_id \\\n",
"3541 ATOM 3542 N None GLY A \n",
"3542 ATOM 3543 CA None GLY A \n",
"3543 ATOM 3544 C None GLY A \n",
"3544 ATOM 3545 O None GLY A \n",
"3545 ATOM 3546 N None SER A \n",
"... ... ... ... ... ... ... \n",
"5790 ATOM 5791 O None PRO A \n",
"5791 ATOM 5792 CB None PRO A \n",
"5792 ATOM 5793 CG None PRO A \n",
"5793 ATOM 5794 CD None PRO A \n",
"5794 ATOM 5795 OXT None PRO A \n",
"\n",
" residue_seq_id residue_insert_code pos_x pos_y pos_z occupancy \\\n",
"3541 1 None 63.937 -26.599 37.997 1.0 \n",
"3542 1 None 64.891 -25.601 38.583 1.0 \n",
"3543 1 None 64.208 -24.275 38.448 1.0 \n",
"3544 1 None 63.079 -24.143 38.876 1.0 \n",
"3545 2 None 64.856 -23.315 37.808 1.0 \n",
"... ... ... ... ... ... ... \n",
"5790 1186 None 77.326 -47.026 15.302 1.0 \n",
"5791 1186 None 79.797 -49.588 14.420 1.0 \n",
"5792 1186 None 81.295 -49.731 14.585 1.0 \n",
"5793 1186 None 81.803 -48.494 15.300 1.0 \n",
"5794 1186 None 77.441 -48.931 16.082 1.0 \n",
"\n",
" b_factor element charge \n",
"3541 88.67 N None \n",
"3542 88.18 C None \n",
"3543 82.85 C None \n",
"3544 81.66 O None \n",
"3545 80.55 N None \n",
"... ... ... ... \n",
"5790 147.25 O None \n",
"5791 159.44 C None \n",
"5792 160.29 C None \n",
"5793 158.37 C None \n",
"5794 151.45 O None \n",
"\n",
"[2254 rows x 15 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Chain A only: the beta-2-microglobulin in chain B is not included here,\n",
"# even though the STCRDab header lists MHCCHAINS=AB.\n",
"mhc_df = df_clean.query(\"chain_id == 'A'\")\n",
"mhc_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d8d7c21a-9e77-4ae2-a62f-d9494ae73ef6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The peptide is a 9-mer!\n"
]
}
],
"source": [
"peptide_df = df_clean.query(\"chain_id == 'C'\")\n",
"\n",
"# Despite the _df suffix this is a DataFrameGroupBy with one group per residue;\n",
"# dropna=False keeps residues whose insert code is missing.\n",
"peptide_residues_df = peptide_df.groupby(\n",
"    by=['residue_seq_id', 'residue_insert_code'],\n",
"    dropna=False,\n",
")\n",
"print(f'The peptide is a {peptide_residues_df.ngroups}-mer!')"
]
},
{
"cell_type": "markdown",
"id": "2aa9b138-a5de-4364-9354-a2ecdd6d84c4",
"metadata": {},
"source": [
"Another advantage of using pandas is that we can add new columns to annotate properties in the structure that we care about. In this example, since the TCR is from STCRDab and has been renumbered using the [IMGT numbering convention](https://www.imgt.org/IMGTScientificChart/Nomenclature/IMGT-FRCDRdefinition.html), we can easily identify the complementarity-determining regions (CDRs) based on their `residue_seq_id` property."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cf2b8449-d99b-4fe2-b7dd-6d5e2e4480d9",
"metadata": {},
"outputs": [],
"source": [
"# Residue positions (inclusive ranges) that make up each CDR loop under the IMGT numbering scheme.\n",
"IMGT_CDR1 = set(range(27, 38 + 1))\n",
"IMGT_CDR2 = set(range(56, 65 + 1))\n",
"IMGT_CDR3 = set(range(105, 117 + 1))\n",
"\n",
"\n",
"def assign_cdr_number(seq_id: int) -> int | None:\n",
"    '''\n",
"    Return the number (1, 2 or 3) of the CDR loop whose IMGT positions include `seq_id`, or None when\n",
"    `seq_id` lies outside every CDR loop.\n",
"    '''\n",
"    for cdr_number, positions in enumerate((IMGT_CDR1, IMGT_CDR2, IMGT_CDR3), start=1):\n",
"        if seq_id in positions:\n",
"            return cdr_number\n",
"\n",
"    return None\n",
"\n",
"# `assign` returns a new frame, so the column is never written onto the slice of df_clean that tcr_df came from.\n",
"tcr_df = tcr_df.assign(cdr=tcr_df['residue_seq_id'].map(assign_cdr_number))"
]
},
{
"cell_type": "markdown",
"id": "6c88a852-fc6c-4778-8ddf-400cf8f0e71f",
"metadata": {},
"source": [
"We can also add annotations for the $\\alpha$ and $\\beta$ chains since these are defined by the STCRDab header."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c146cbc8-0e20-4d92-94f3-9f15ab78f18d",
"metadata": {},
"outputs": [],
"source": [
"# Chain roles come from the STCRDab REMARK header (ACHAIN=D, BCHAIN=E).\n",
"# An explicit lookup leaves any unexpected chain ID as NaN instead of silently labelling it 'beta'.\n",
"tcr_df['chain_type'] = tcr_df['chain_id'].map({'D': 'alpha', 'E': 'beta'})"
]
},
{
"cell_type": "markdown",
"id": "de413b50-f62e-44c5-854f-e4305476872a",
"metadata": {},
"source": [
"Now we can easily get rich information about the TCR CDR loops."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "17838d05-d15b-4c24-b413-6d24d31dfef1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"chain_type cdr\n",
"alpha 1.0 6\n",
" 2.0 6\n",
" 3.0 9\n",
"beta 1.0 6\n",
" 2.0 5\n",
" 3.0 13\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tcr_cdrs_df = tcr_df.query('cdr.notnull()')\n",
"\n",
"# Reduce the atom rows to one row per residue, then count residues per (chain_type, cdr) pair.\n",
"cdr_lengths = (\n",
"    tcr_cdrs_df[['chain_type', 'cdr', 'residue_seq_id', 'residue_insert_code']]\n",
"    .drop_duplicates()\n",
"    .groupby(['chain_type', 'cdr'], dropna=False)\n",
"    .size()\n",
")\n",
"\n",
"cdr_lengths"
]
},
{
"cell_type": "markdown",
"id": "85d75d9b-999f-45a4-84bf-81ff030f96d4",
"metadata": {},
"source": [
"We can also easily plot properties such as B-factors in the TCR variable domain."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "420ebece-4820-4118-82ba-a86424838d72",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47f76eb6-e44e-473e-b902-954d6eea0dc2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# NOTE(review): 129 is presumably the last IMGT position of the variable domain - confirm.\n",
"tcr_variable_df = tcr_df.query('residue_seq_id <= 129')\n",
"\n",
"# No hue is given, so atoms from both TCR chains are pooled at each residue position.\n",
"sns.lineplot(data=tcr_variable_df, x='residue_seq_id', y='b_factor')"
]
},
{
"cell_type": "markdown",
"id": "02751e2a-5907-4af5-bfb9-c1769650d906",
"metadata": {},
"source": [
"Computing things like contacting residues between the TCR CDR loops and the peptide is a breeze."
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0cb3bbc1-3d3f-4b1d-98a0-26603a94d344",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9d8c8519-2eea-466b-8b9b-6af474984acf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
chain_id_tcr
\n",
"
residue_seq_id_tcr
\n",
"
residue_insert_code_tcr
\n",
"
cdr
\n",
"
chain_type
\n",
"
residue_seq_id_peptide
\n",
"
residue_insert_code_peptide
\n",
"
\n",
" \n",
" \n",
"
\n",
"
6424
\n",
"
E
\n",
"
108
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
8
\n",
"
None
\n",
"
\n",
"
\n",
"
6603
\n",
"
E
\n",
"
109
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
9
\n",
"
None
\n",
"
\n",
"
\n",
"
6641
\n",
"
E
\n",
"
109
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
7
\n",
"
None
\n",
"
\n",
"
\n",
"
6704
\n",
"
E
\n",
"
109
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
8
\n",
"
None
\n",
"
\n",
"
\n",
"
6926
\n",
"
E
\n",
"
110
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
7
\n",
"
None
\n",
"
\n",
"
\n",
"
7151
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
7
\n",
"
None
\n",
"
\n",
"
\n",
"
7194
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
4
\n",
"
None
\n",
"
\n",
"
\n",
"
7253
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
5
\n",
"
None
\n",
"
\n",
"
\n",
"
7261
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
6
\n",
"
None
\n",
"
\n",
"
\n",
"
7280
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
8
\n",
"
None
\n",
"
\n",
"
\n",
"
7361
\n",
"
E
\n",
"
111
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
3
\n",
"
None
\n",
"
\n",
"
\n",
"
7594
\n",
"
E
\n",
"
112
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
4
\n",
"
None
\n",
"
\n",
"
\n",
"
7596
\n",
"
E
\n",
"
112
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
5
\n",
"
None
\n",
"
\n",
"
\n",
"
7603
\n",
"
E
\n",
"
112
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
6
\n",
"
None
\n",
"
\n",
"
\n",
"
7607
\n",
"
E
\n",
"
112
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
7
\n",
"
None
\n",
"
\n",
"
\n",
"
7622
\n",
"
E
\n",
"
112
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
8
\n",
"
None
\n",
"
\n",
"
\n",
"
8114
\n",
"
E
\n",
"
113
\n",
"
None
\n",
"
3.0
\n",
"
beta
\n",
"
5
\n",
"
None
\n",
"
\n",
"
\n",
"
11763
\n",
"
D
\n",
"
37
\n",
"
None
\n",
"
1.0
\n",
"
alpha
\n",
"
5
\n",
"
None
\n",
"
\n",
"
\n",
"
11923
\n",
"
D
\n",
"
37
\n",
"
None
\n",
"
1.0
\n",
"
alpha
\n",
"
4
\n",
"
None
\n",
"
\n",
"
\n",
"
12030
\n",
"
D
\n",
"
37
\n",
"
None
\n",
"
1.0
\n",
"
alpha
\n",
"
2
\n",
"
None
\n",
"
\n",
"
\n",
"
12033
\n",
"
D
\n",
"
37
\n",
"
None
\n",
"
1.0
\n",
"
alpha
\n",
"
3
\n",
"
None
\n",
"
\n",
"
\n",
"
12218
\n",
"
D
\n",
"
38
\n",
"
None
\n",
"
1.0
\n",
"
alpha
\n",
"
5
\n",
"
None
\n",
"
\n",
"
\n",
"
13131
\n",
"
D
\n",
"
57
\n",
"
None
\n",
"
2.0
\n",
"
alpha
\n",
"
5
\n",
"
None
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" chain_id_tcr residue_seq_id_tcr residue_insert_code_tcr cdr \\\n",
"6424 E 108 None 3.0 \n",
"6603 E 109 None 3.0 \n",
"6641 E 109 None 3.0 \n",
"6704 E 109 None 3.0 \n",
"6926 E 110 None 3.0 \n",
"7151 E 111 None 3.0 \n",
"7194 E 111 None 3.0 \n",
"7253 E 111 None 3.0 \n",
"7261 E 111 None 3.0 \n",
"7280 E 111 None 3.0 \n",
"7361 E 111 None 3.0 \n",
"7594 E 112 None 3.0 \n",
"7596 E 112 None 3.0 \n",
"7603 E 112 None 3.0 \n",
"7607 E 112 None 3.0 \n",
"7622 E 112 None 3.0 \n",
"8114 E 113 None 3.0 \n",
"11763 D 37 None 1.0 \n",
"11923 D 37 None 1.0 \n",
"12030 D 37 None 1.0 \n",
"12033 D 37 None 1.0 \n",
"12218 D 38 None 1.0 \n",
"13131 D 57 None 2.0 \n",
"\n",
" chain_type residue_seq_id_peptide residue_insert_code_peptide \n",
"6424 beta 8 None \n",
"6603 beta 9 None \n",
"6641 beta 7 None \n",
"6704 beta 8 None \n",
"6926 beta 7 None \n",
"7151 beta 7 None \n",
"7194 beta 4 None \n",
"7253 beta 5 None \n",
"7261 beta 6 None \n",
"7280 beta 8 None \n",
"7361 beta 3 None \n",
"7594 beta 4 None \n",
"7596 beta 5 None \n",
"7603 beta 6 None \n",
"7607 beta 7 None \n",
"7622 beta 8 None \n",
"8114 beta 5 None \n",
"11763 alpha 5 None \n",
"11923 alpha 4 None \n",
"12030 alpha 2 None \n",
"12033 alpha 3 None \n",
"12218 alpha 5 None \n",
"13131 alpha 5 None "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CONTACT_DISTANCE = 5  # Angstroms (Å)\n",
"\n",
"def euclidean_distance(x1, y1, z1, x2, y2, z2):\n",
"    '''Distance between points (x1, y1, z1) and (x2, y2, z2); element-wise when given arrays or Series.'''\n",
"    dx, dy, dz = x2 - x1, y2 - y1, z2 - z1\n",
"    return np.sqrt(dx**2 + dy**2 + dz**2)\n",
"\n",
"\n",
"# Pair every CDR atom with every peptide atom; columns present in both frames get the suffixes.\n",
"interface = tcr_cdrs_df.merge(peptide_df, how='cross', suffixes=('_tcr', '_peptide'))\n",
"interface['atom_distances'] = euclidean_distance(\n",
"    *(interface[f'pos_{axis}_tcr'] for axis in 'xyz'),\n",
"    *(interface[f'pos_{axis}_peptide'] for axis in 'xyz'),\n",
")\n",
"\n",
"# Keep the atom pairs within the cutoff, then collapse them to unique residue pairs.\n",
"contacting_atoms = interface.loc[interface['atom_distances'] <= CONTACT_DISTANCE]\n",
"contacting_residues = contacting_atoms[[\n",
"    'chain_id_tcr', 'residue_seq_id_tcr', 'residue_insert_code_tcr', 'cdr', 'chain_type',\n",
"    'residue_seq_id_peptide', 'residue_insert_code_peptide',\n",
"]].drop_duplicates()\n",
"\n",
"contacting_residues"
]
},
{
"cell_type": "markdown",
"id": "0974854b-e4d7-487d-8487-1986ddca94e1",
"metadata": {},
"source": [
"Finally, if we want to save part of the data frame as a PDB file, we can convert it back to a `Structure` object and save it to a file. "
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "7e13b11d-fea0-4a81-a66f-5f220b64cdcd",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"from python_pdb.entities import Structure, StructureConstructionWarning"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "6aad2173-f2e5-4576-89cc-4068fc78a0ca",
"metadata": {},
"outputs": [],
"source": [
"# This PDB file specifies alternate locations, which is why the StructureConstructionWarning is silenced.\n",
"# Only that category is ignored, and only while the structure is being built.\n",
"with warnings.catch_warnings():\n",
"    warnings.filterwarnings(action='ignore', category=StructureConstructionWarning)\n",
"    tcr_structure = Structure.from_pandas(tcr_df)\n",
"\n",
"# The string form of the Structure is what gets written to disk.\n",
"with open('tcr.pdb', mode='w') as fh:\n",
"    fh.write(str(tcr_structure))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8e71c878-c387-40c4-bef3-f097831122d7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tcr.pdb\n"
]
}
],
"source": [
"# -F makes grep treat '.pdb' as a literal string; as a regex the dot would match any character.\n",
"!ls | grep -F '.pdb'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b36cde25-4f1e-413d-a4d0-ebeb39caf303",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ATOM 58 N SER E 1 44.786 19.936 31.694 1.00 91.71 N \n",
"ATOM 59 CA SER E 1 43.328 20.085 31.879 1.00 88.14 C \n",
"ATOM 60 C SER E 1 42.582 19.234 30.860 1.00 80.31 C \n",
"ATOM 61 O SER E 1 41.944 19.762 29.940 1.00 81.48 O \n",
"ATOM 62 CB SER E 1 42.912 21.571 31.778 1.00 93.53 C \n",
"ATOM 63 OG SER E 1 41.483 21.690 31.841 1.00 93.62 O \n",
"ATOM 64 N GLN E 2 42.634 17.921 31.043 1.00 72.67 N \n",
"ATOM 65 CA GLN E 2 41.716 17.039 30.308 1.00 69.30 C \n",
"ATOM 66 C GLN E 2 40.246 17.307 30.674 1.00 66.83 C \n",
"ATOM 67 O GLN E 2 39.999 17.911 31.668 1.00 70.99 O \n"
]
}
],
"source": [
"# Show the start of the written file to confirm it contains PDB-formatted ATOM records.\n",
"!head tcr.pdb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}